//
// Project: Disagreement in science: Missing women



// start from an empty session and pin the Stata version used to interpret this file
clear all
version 15.1



//
// set locals

// method of identifying gender
local female female_genderize
local male male_genderize

// gender of author is known
local known_gender female_genderize!=.

// field controls aer: factor-variable terms i.field_<name>, one per field indicator
local field_controls_aer
foreach f in microeconomics theory macroeconomics labor econometrics io international finance public health_urban development history lab other {
	local field_controls_aer `field_controls_aer' i.field_`f'
}

// field controls Nature: factor-variable terms i.<name>, one per field indicator
local field_controls_nature
foreach f in earth physical social biological health {
	local field_controls_nature `field_controls_nature' i.`f'
}

// field controls PNAS
local field_controls_pnas i.type_num



//
// AER

// call data
use "${data}/output/aer_data_gender_wos.dta", clear
*drop if month=="May"  // exclude AEA papers and proceedings
drop if year==2020
keep if (comment | research_article)

// keep the author_id==1 record only (presumably one record per paper; confirm against the data build)
// and score the similarity between title and ArticleTitle (similscore, presumably added by matchit)
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore

// keep correct wos matches only; no need to enter citation counts manually for comments.
// Stata treats missing values as larger than any number, so "similscore>0.5" on its own
// would also keep records whose similarity score is missing; exclude those explicitly.
keep if similscore>0.5 & !missing(similscore)

// paper and proceedings indicator
generate aeapp = month=="May" & year<2019

// rescale citation count by average citation count in the field and year:
// fit on non-comment, non-aeapp records, then predict for every record
regress AverageperYear i.year `field_controls_aer' if comment==0 & aeapp==0
predict precited_averageperyear  // out of sample (comments and aeapp) predictions as well
sum AverageperYear if comment==0 & aeapp==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: store percentiles and the maximum of adjusted_avgcites as scalars named
// <stat>_<i>_<j>, with i=1 for this journal and j=0 (comment==0, aeapp==0),
// j=1 (comment==1, aeapp==0), j=2 (comment==0, aeapp==1)
local i=1
forvalues j=0/1 {
	sum adjusted_avgcites if comment==`j' & author_id==1 & aeapp==0, d
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}
local j=2
sum adjusted_avgcites if comment==0 & author_id==1 & aeapp==1, d
foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
	scalar `stat'_`i'_`j'=r(`stat')
}



//
// ASR

// load the ASR data and restrict to comments and research articles
use "${data}/output/asr_data_gender_wos.dta", clear
keep if (comment | research_article)

// restrict to author_id==1 and compare title with ArticleTitle to inspect the wos matches
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore  // no need to enter citation counts manually for comments

// rescale citation count by average citation count in the year:
// the year means are estimated on non-comments and predicted for all records
regress AverageperYear i.year if comment==0
predict precited_averageperyear, xb  // out of sample (comments) predictions as well
summarize AverageperYear if comment==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: scalars <stat>_<i>_<j> hold the detailed summary statistics of
// adjusted_avgcites for journal i=2 and comment==j
local i=2
forvalues j=0/1 {
	summarize adjusted_avgcites if comment==`j' & author_id==1, detail
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}



//
// JAMA

// load the JAMA data
use "${data}/output/jama_pubmed_data_gender_wos.dta", clear

// sample window: drop 2020 and everything before July 2013
// (full author names from PubMed not available before 2002;
//  Comment & Response section started in July 2013)
drop if year==2020 | year<2013
drop if year==2013 & inlist(month, "January", "February", "March", "April", "May", "June")

// restrict to comments and research articles
keep if comment | research_article
drop if article_with_etal
drop if strpos(full_name, "Fontanarosa")  // this JAMA editor appeared as first author of letters to the editor

// restrict to author_id==1 and keep wos matches whose title similarity passes the cutoff
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore
keep if similscore>=0.62  // cutoff chosen after visual inspection

// rescale citation count by average citation count in the year:
// the year means are estimated on non-comments and predicted for all records
regress AverageperYear i.year if comment==0
predict precited_averageperyear, xb  // out of sample (comments) predictions as well
summarize AverageperYear if comment==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: scalars <stat>_<i>_<j> hold the detailed summary statistics of
// adjusted_avgcites for journal i=3 and comment==j
local i=3
forvalues j=0/1 {
	summarize adjusted_avgcites if comment==`j' & author_id==1, detail
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}



//
// Nature

// load the Nature data and restrict to comments and research articles before 2020
use "${data}/output/nature_data_gender_wos.dta", clear
drop if year==2020
keep if comment | research_article

// merge with field information, keeping matched records only (_merge==3)
merge m:1 article_id using "${data}/output/nature_matched_ids.dta", keep(match)

// restrict to author_id==1, drop poorly matched non-comments, and enter
// citation counts by hand for selected article_id values
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore
drop if comment==0 & similscore<0.45
replace AverageperYear=2 if article_id==26465  // from Google Scholar Feb 2021
replace AverageperYear=3.71 if article_id==22895  // from Google Scholar Feb 2021
replace AverageperYear=0.07 if article_id==5556  // from Google Scholar Feb 2021
replace AverageperYear=3.1 if article_id==17206  // from Google Scholar Feb 2021

// rescale citation count by average citation count in the field and year:
// estimated on non-comments and predicted for all records
regress AverageperYear i.year `field_controls_nature' if comment==0
predict precited_averageperyear, xb  // out of sample (comments) predictions as well
summarize AverageperYear if comment==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: scalars <stat>_<i>_<j> hold the detailed summary statistics of
// adjusted_avgcites for journal i=4 and comment==j
local i=4
forvalues j=0/1 {
	summarize adjusted_avgcites if comment==`j' & author_id==1, detail
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}



//
// PNAS 

// load the PNAS data
use "${data}/output/pnas_data_gender_wos.dta", clear

// name suffixes erroneously scraped as separate author-article observations
drop if inlist(full_name, "II", "III", "IV", "Jr", "Jr.")

// sample window: PNAS started comments in 2008; 2020 is excluded
drop if year==2020 | year<2008
keep if comment | research_article

// restrict to author_id==1, compare title with ArticleTitle, and remove the
// listed article_id values
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore
drop if inlist(article_id, 58108, 58119, 41966, 70221, 36835, 44580, 47008, 69073, 82710, 46339, 67815, 59305, 36979, 53801)
// manual overrides of AverageperYear for individual article_id values;
// per the inline notes, each value was taken from Google Scholar in Feb 2021
replace AverageperYear=0 if article_id==34110  // from Google Scholar Feb 2021
replace AverageperYear=0.08 if article_id==35412  // from Google Scholar Feb 2021
replace AverageperYear=0.42 if article_id==36092  // from Google Scholar Feb 2021
replace AverageperYear=1.75 if article_id==34269  // from Google Scholar Feb 2021
replace AverageperYear=1.5 if article_id==35516  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==35573  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==35625  // from Google Scholar Feb 2021
replace AverageperYear=0.25 if article_id==35749  // from Google Scholar Feb 2021
replace AverageperYear=0.75 if article_id==71812  // from Google Scholar Feb 2021
replace AverageperYear=1.08 if article_id==34645  // from Google Scholar Feb 2021
replace AverageperYear=14.67 if article_id==75797  // from Google Scholar Feb 2021
replace AverageperYear=1.18 if article_id==39872  // from Google Scholar Feb 2021
replace AverageperYear=38 if article_id==75796  // from Google Scholar Feb 2021
replace AverageperYear=1.33 if article_id==35871  // from Google Scholar Feb 2021
replace AverageperYear=0.67 if article_id==37102  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==36090  // from Google Scholar Feb 2021
replace AverageperYear=1.42 if article_id==35255  // from Google Scholar Feb 2021
replace AverageperYear=0.42 if article_id==35679  // from Google Scholar Feb 2021
replace AverageperYear=0.75 if article_id==36093  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==36238  // from Google Scholar Feb 2021
replace AverageperYear=1.67 if article_id==35517  // from Google Scholar Feb 2021
replace AverageperYear=1.25 if article_id==37103  // from Google Scholar Feb 2021
replace AverageperYear=1.08 if article_id==35808  // from Google Scholar Feb 2021
replace AverageperYear=0.18 if article_id==39175  // from Google Scholar Feb 2021
replace AverageperYear=1.17 if article_id==36091  // from Google Scholar Feb 2021
replace AverageperYear=1.33 if article_id==35472  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==35934  // from Google Scholar Feb 2021
replace AverageperYear=2 if article_id==64502  // from Google Scholar Feb 2021
replace AverageperYear=1.17 if article_id==34813  // from Google Scholar Feb 2021
replace AverageperYear=0.58 if article_id==35747  // from Google Scholar Feb 2021
replace AverageperYear=1.75 if article_id==34378  // from Google Scholar Feb 2021
replace AverageperYear=0.83 if article_id==35305  // from Google Scholar Feb 2021
replace AverageperYear=8.33 if article_id==75795  // from Google Scholar Feb 2021
replace AverageperYear=1.67 if article_id==37183  // from Google Scholar Feb 2021
replace AverageperYear=4.83 if article_id==36415  // from Google Scholar Feb 2021
replace AverageperYear=0.75 if article_id==35306  // from Google Scholar Feb 2021

// numeric field code from the section label; records with any other label stay missing
generate type_num = cond(type=="Biological Sciences", 1, cond(type=="Physical Sciences", 2, cond(type=="Social Sciences", 3, .)))

// generate field information for comments based on article-comment links:
// a record with a non-missing call_to takes the mean type_num of the records
// whose article_id equals that call_to value
local N = _N
forvalues i=1/`N' {
	if call_to[`i'] != . {
		local id = call_to[`i']
		summarize type_num if article_id==`id'
		replace type_num = r(mean) in `i'
	}
}

// rescale citation count by average citation count in the field and year:
// estimated on non-comments and predicted for all records
regress AverageperYear i.year `field_controls_pnas' if comment==0
predict precited_averageperyear, xb  // out of sample (comments) predictions as well
summarize AverageperYear if comment==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: scalars <stat>_<i>_<j> hold the detailed summary statistics of
// adjusted_avgcites for journal i=5 and comment==j
local i=5
forvalues j=0/1 {
	summarize adjusted_avgcites if comment==`j' & author_id==1, detail
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}



//
// Science

// load the Science data and restrict to comments and research articles before 2020
use "${data}/output/science_data_gender_wos.dta", clear
drop if year==2020
keep if comment | research_article

// restrict to author_id==1, compare title with ArticleTitle, and remove the
// listed article_id values
keep if author_id==1
matchit title ArticleTitle
order ArticleTitle similscore, after(title)
sort comment similscore
drop if inlist(article_id, 46580, 38529, 46230, 5474, 93)
// manual overrides of AverageperYear for individual article_id values;
// per the inline notes, each value was taken from Google Scholar in Feb 2021
replace AverageperYear=0.38 if article_id==3262  // from Google Scholar Feb 2021
replace AverageperYear=0.24 if article_id==3102  // from Google Scholar Feb 2021
replace AverageperYear=1.24 if article_id==4013  // from Google Scholar Feb 2021
replace AverageperYear=1.24 if article_id==3726  // from Google Scholar Feb 2021
replace AverageperYear=1.16 if article_id==8610  // from Google Scholar Feb 2021
replace AverageperYear=1.77 if article_id==1656  // from Google Scholar Feb 2021
replace AverageperYear=9.09 if article_id==2463  // from Google Scholar Feb 2021
replace AverageperYear=0.25 if article_id==6473  // from Google Scholar Feb 2021
replace AverageperYear=0.33 if article_id==3814  // from Google Scholar Feb 2021
replace AverageperYear=2.75 if article_id==6062  // from Google Scholar Feb 2021
replace AverageperYear=2.6 if article_id==6227  // from Google Scholar Feb 2021
replace AverageperYear=1.05 if article_id==7679  // from Google Scholar Feb 2021
replace AverageperYear=0.77 if article_id==3770  // from Google Scholar Feb 2021
replace AverageperYear=1.35 if article_id==5429  // from Google Scholar Feb 2021
replace AverageperYear=1.48 if article_id==4975  // from Google Scholar Feb 2021
replace AverageperYear=1.82 if article_id==2287  // from Google Scholar Feb 2021
replace AverageperYear=8.38 if article_id==3574  // from Google Scholar Feb 2021
replace AverageperYear=1.14 if article_id==1565  // from Google Scholar Feb 2021
replace AverageperYear=0.37 if article_id==8861  // from Google Scholar Feb 2021
replace AverageperYear=2.36 if article_id==1078  // from Google Scholar Feb 2021
replace AverageperYear=2.52 if article_id==3382  // from Google Scholar Feb 2021
replace AverageperYear=0.1 if article_id==3688  // from Google Scholar Feb 2021
replace AverageperYear=2 if article_id==6752  // from Google Scholar Feb 2021
replace AverageperYear=0.47 if article_id==8374  // from Google Scholar Feb 2021
replace AverageperYear=0.24 if article_id==5140  // from Google Scholar Feb 2021
replace AverageperYear=0.05 if article_id==784  // from Google Scholar Feb 2021
replace AverageperYear=0.45 if article_id==7520  // from Google Scholar Feb 2021
replace AverageperYear=1.89 if article_id==8183  // from Google Scholar Feb 2021
replace AverageperYear=1.23 if article_id==1521  // from Google Scholar Feb 2021
replace AverageperYear=0.47 if article_id==8279  // from Google Scholar Feb 2021
replace AverageperYear=0.64 if article_id==2027  // from Google Scholar Feb 2021
replace AverageperYear=0.05 if article_id==2512  // from Google Scholar Feb 2021
replace AverageperYear=5.2 if article_id==7941  // from Google Scholar Feb 2021
replace AverageperYear=3 if article_id==4245  // from Google Scholar Feb 2021
replace AverageperYear=20.95 if article_id==7113  // from Google Scholar Feb 2021
replace AverageperYear=1.38 if article_id==2726  // from Google Scholar Feb 2021
replace AverageperYear=8.15 if article_id==6313  // from Google Scholar Feb 2021
replace AverageperYear=3.24 if article_id==2725  // from Google Scholar Feb 2021
replace AverageperYear=0.86 if article_id==4671  // from Google Scholar Feb 2021
replace AverageperYear=0.36 if article_id==282  // from Google Scholar Feb 2021
replace AverageperYear=2 if article_id==3533  // from Google Scholar Feb 2021
replace AverageperYear=0.23 if article_id==2676  // from Google Scholar Feb 2021
replace AverageperYear=2.95 if article_id==8703  // from Google Scholar Feb 2021
replace AverageperYear=5.05 if article_id==6414  // from Google Scholar Feb 2021
replace AverageperYear=0.2 if article_id==7423  // from Google Scholar Feb 2021
replace AverageperYear=0.52 if article_id==4337  // from Google Scholar Feb 2021
replace AverageperYear=0.95 if article_id==7260  // from Google Scholar Feb 2021
replace AverageperYear=0.14 if article_id==3862  // from Google Scholar Feb 2021
replace AverageperYear=2.43 if article_id==3618  // from Google Scholar Feb 2021
replace AverageperYear=0.5 if article_id==5728  // from Google Scholar Feb 2021
replace AverageperYear=2.82 if article_id==2343  // from Google Scholar Feb 2021
replace AverageperYear=0.35 if article_id==6355  // from Google Scholar Feb 2021
replace AverageperYear=2.27 if article_id==1473  // from Google Scholar Feb 2021
replace AverageperYear=7.86 if article_id==5024  // from Google Scholar Feb 2021
replace AverageperYear=1.71 if article_id==5249  // from Google Scholar Feb 2021
replace AverageperYear=1.75 if article_id==5783  // from Google Scholar Feb 2021
replace AverageperYear=0.33 if article_id==3971  // from Google Scholar Feb 2021
replace AverageperYear=2.24 if article_id==3215  // from Google Scholar Feb 2021
replace AverageperYear=0.23 if article_id==92  // from Google Scholar Feb 2021
replace AverageperYear=0.26 if article_id==8751  // from Google Scholar Feb 2021
replace AverageperYear=2.65 if article_id==7069  // from Google Scholar Feb 2021
replace AverageperYear=0.43 if article_id==4198  // from Google Scholar Feb 2021
replace AverageperYear=1.47 if article_id==8232  // from Google Scholar Feb 2021
replace AverageperYear=1.11 if article_id==8906  // from Google Scholar Feb 2021
replace AverageperYear=0.4 if article_id==5784  // from Google Scholar Feb 2021
replace AverageperYear=1.85 if article_id==7211  // from Google Scholar Feb 2021
replace AverageperYear=0.48 if article_id==3687  // from Google Scholar Feb 2021
replace AverageperYear=2.6 if article_id==5833  // from Google Scholar Feb 2021
replace AverageperYear=1.67 if article_id==49288  // from Google Scholar Feb 2021
replace AverageperYear=2.86 if article_id==380  // from Google Scholar Feb 2021
replace AverageperYear=2.33 if article_id==3476  // from Google Scholar Feb 2021
replace AverageperYear=1 if article_id==2118  // from Google Scholar Feb 2021
replace AverageperYear=0.52 if article_id==4081  // from Google Scholar Feb 2021
replace AverageperYear=2.14 if article_id==5198  // from Google Scholar Feb 2021
replace AverageperYear=2.5 if article_id==2563  // from Google Scholar Feb 2021
replace AverageperYear=0.65 if article_id==7841  // from Google Scholar Feb 2021
replace AverageperYear=1.90 if article_id==4590  // from Google Scholar Feb 2021
replace AverageperYear=0.59 if article_id==1910  // from Google Scholar Feb 2021
replace AverageperYear=0.7 if article_id==5619  // from Google Scholar Feb 2021
replace AverageperYear=0.27 if article_id==2395  // from Google Scholar Feb 2021
replace AverageperYear=0.33 if article_id==2956  // from Google Scholar Feb 2021
replace AverageperYear=0.75 if article_id==6709  // from Google Scholar Feb 2021
replace AverageperYear=1.38 if article_id==5313  // from Google Scholar Feb 2021
replace AverageperYear=1.3 if article_id==7164  // from Google Scholar Feb 2021
replace AverageperYear=3.05 if article_id==5095  // from Google Scholar Feb 2021
replace AverageperYear=0.15 if article_id==5673  // from Google Scholar Feb 2021
replace AverageperYear=1.19 if article_id==3008  // from Google Scholar Feb 2021
replace AverageperYear=0.64 if article_id==1406  // from Google Scholar Feb 2021
replace AverageperYear=1.05 if article_id==5573  // from Google Scholar Feb 2021
replace AverageperYear=0.38 if article_id==3725  // from Google Scholar Feb 2021
replace AverageperYear=0.48 if article_id==4293  // from Google Scholar Feb 2021
replace AverageperYear=0.68 if article_id==8540  // from Google Scholar Feb 2021
replace AverageperYear=0.09 if article_id==2622  // from Google Scholar Feb 2021
replace AverageperYear=4.16 if article_id==8095  // from Google Scholar Feb 2021
replace AverageperYear=2.71 if article_id==4457  // from Google Scholar Feb 2021
replace AverageperYear=3.59 if article_id==681  // from Google Scholar Feb 2021
replace AverageperYear=1.37 if article_id==8979  // from Google Scholar Feb 2021
replace AverageperYear=1.75 if article_id==6128  // from Google Scholar Feb 2021
replace AverageperYear=0.32 if article_id==1705  // from Google Scholar Feb 2021
replace AverageperYear=0.8 if article_id==5572  // from Google Scholar Feb 2021
replace AverageperYear=0.14 if article_id==2237  // from Google Scholar Feb 2021
replace AverageperYear=3.5 if article_id==5524  // from Google Scholar Feb 2021
replace AverageperYear=3.5 if article_id==49849  // from Google Scholar Feb 2021
replace AverageperYear=6.55 if article_id==2621  // from Google Scholar Feb 2021
replace AverageperYear=2.81 if article_id==5367  // from Google Scholar Feb 2021
replace AverageperYear=0.81 if article_id==2768  // from Google Scholar Feb 2021
replace AverageperYear=0.14 if article_id==1610  // from Google Scholar Feb 2021
replace AverageperYear=2 if article_id==7577  // from Google Scholar Feb 2021
replace AverageperYear=3.5 if article_id==7308  // from Google Scholar Feb 2021
replace AverageperYear=4.2 if article_id==6883  // from Google Scholar Feb 2021
replace AverageperYear=3.6 if article_id==6794  // from Google Scholar Feb 2021
replace AverageperYear=0.37 if article_id==8493  // from Google Scholar Feb 2021
replace AverageperYear=2.3 if article_id==6950  // from Google Scholar Feb 2021
replace AverageperYear=1.84 if article_id==8655  // from Google Scholar Feb 2021
replace AverageperYear=1.24 if article_id==4915  // from Google Scholar Feb 2021
replace AverageperYear=2.38 if article_id==3815  // from Google Scholar Feb 2021
replace AverageperYear=0 if article_id==728  // from Google Scholar Feb 2021
replace AverageperYear=7.32 if article_id==630  // from Google Scholar Feb 2021
replace AverageperYear=2.05 if article_id==2819  // from Google Scholar Feb 2021
replace AverageperYear=3.57 if article_id==3928  // from Google Scholar Feb 2021
replace AverageperYear=0.57 if article_id==5139  // from Google Scholar Feb 2021
replace AverageperYear=7.15 if article_id==6624  // from Google Scholar Feb 2021
replace AverageperYear=0.32 if article_id==2344  // from Google Scholar Feb 2021
replace AverageperYear=0.09 if article_id==1863  // from Google Scholar Feb 2021
replace AverageperYear=1.23 if article_id==1821  // from Google Scholar Feb 2021
replace AverageperYear=0.23 if article_id==1757  // from Google Scholar Feb 2021
replace AverageperYear=0.33 if article_id==4123  // from Google Scholar Feb 2021

// rescale citation count by average citation count in the year:
// the year means are estimated on non-comments and predicted for all records
regress AverageperYear i.year if comment==0
predict precited_averageperyear, xb  // out of sample (comments) predictions as well
summarize AverageperYear if comment==0
generate total_mean_cites = r(mean)
generate adjusted_avgcites = AverageperYear * (total_mean_cites/precited_averageperyear)

// analysis: scalars <stat>_<i>_<j> hold the detailed summary statistics of
// adjusted_avgcites for journal i=6 and comment==j
local i=6
forvalues j=0/1 {
	summarize adjusted_avgcites if comment==`j' & author_id==1, detail
	foreach stat in p1 p5 p25 p50 p75 p95 p99 max {
		scalar `stat'_`i'_`j'=r(`stat')
	}
}

